# Dickstein, Ho, and Mark (2023)
# This script collects household characteristics, choice sets, estimated 
# demand parameters, and estimated price setting parameters.

# * # * # * # * # * # * #
# LOADING DATA          #
# * # * # * # * # * # * #

## Modified Exploded Data ##

# Loading counterfactual sample of household-option dataset
if(!"load_base_data" %in% ls()){load_base_data <- T}
if(load_base_data == T){
  source(paste0(counterfactual_folder, "/library/DA02CreateCounterfactualMedExplodData.R"))
}

## Demand Parameters ##

if(load_params == T){
  # Loading estimated parameters for switchers:
  IndMarketEstimates <- fread(paste0(project_folder, "/analysis/estimationcode/specs/", spectouse, "_main/output/solution.csv"))
  ## Kondo.
  IndMarketEstimates <- IndMarketEstimates[, -c(1,2)]

  # Loading the number of beta1, 2, 3, 0 parameters:
  IndMarketlabels <- fread(paste0(project_folder, "/analysis/estimationcode/specs/", spectouse, "_main/output/covar_lens.csv"))
  IndMarSE <- cumsum(IndMarketlabels$x)

  # Loading estimated parameters for switchers:
  SGMarketEstimates <- fread(paste0(project_folder, "/analysis/estimationcode/specs/lowrisk_simplemh_censor0.2_switcher/output/solution.csv"))
  ## Removing un-needed Variables:
  SGMarketEstimates <- SGMarketEstimates[, -c(1,2)]

  # Loading the number of beta1, 2, 3, 0 parameters:
  SGMarketlabels <- fread(paste0(project_folder, "/analysis/estimationcode/specs/lowrisk_simplemh_censor0.2_switcher/output/covar_lens.csv"))
  SGMarSE <- cumsum(SGMarketlabels$x)
}

## Rating Factors ##

# Loading rating factors
RatingFactorDat <- fread(
  "counterfactual_all_subscribers.csv",
  select = c("subscriberid", "year", "RatioSum"))

## Supply Side ##

# Loading supply side data
AdminData <- fread(paste0(project_folder, "/data/orig/carrier_collapsed_mlr_oregon.csv"))

# Loading premium setting coefficients
if(load_params == T){
  SupplySideCoefficients <-fread(paste0(project_folder, "/analysis/tablesandfigures/release/prem_setting/tables/pr_paperfull_best_guess_ra_s_wsimcost.csv"))
}

# * # * # * # * # * # * # * # * # * # * #
# GENERATING IND MARKET SAMPLE a,o,p    #
# * # * # * # * # * # * # * # * # * # * #

# * CLEANING PARAMETER ESTIMATES * #
if(load_params == T){
  # Setting individual market demand parameter values
  NamedBetaList <- list(
    beta1 = as.numeric(IndMarketEstimates[1, 1:IndMarSE[1]]),
    beta2 = as.numeric(IndMarketEstimates[1, (1 + IndMarSE[1]):IndMarSE[2]]),
    beta3 = as.numeric(IndMarketEstimates[1, (1 + IndMarSE[2]):IndMarSE[3]]),
    beta0 = as.numeric(IndMarketEstimates[1, (1 + IndMarSE[3]):IndMarSE[4]])
  )

  # Setting individual market demand parameter names
  names(NamedBetaList[[1]]) <- names(IndMarketEstimates)[1:IndMarSE[1]]
  names(NamedBetaList[[2]]) <- names(IndMarketEstimates)[(1 + IndMarSE[1]):IndMarSE[2]]
  names(NamedBetaList[[3]]) <- names(IndMarketEstimates)[(1 + IndMarSE[2]):IndMarSE[3]]
  names(NamedBetaList[[4]]) <- names(IndMarketEstimates)[(1 + IndMarSE[3]):IndMarSE[4]]
}

# * SUBSETTING EXPLODED DATA * #

#Subsetting to individual market
ExplodedData_Ind <- mod_exploded_data[markettype_old == "Individual"]

# Subsetting exploded data to the sample we want to run analysis on
ExplodedData_Ind <- ExplodedData_Ind[best_guess_ra %in% ravec]
ExplodedData_Ind <- ExplodedData_Ind[year %in% yrvec]

# * CREATING FINAL DATASET * #

# Construct alpha, omega, psi, fixed effect term in the household-option dataset:
ExplodedData_Ind$alpha <- exp(as.matrix(ExplodedData_Ind[, names(NamedBetaList$beta1), with = F]) %*% NamedBetaList$beta1)
ExplodedData_Ind$omega <- exp(as.matrix(ExplodedData_Ind[, names(NamedBetaList$beta2), with = F]) %*% NamedBetaList$beta2)
ExplodedData_Ind$psi <- exp(as.matrix(ExplodedData_Ind[, names(NamedBetaList$beta3), with = F]) %*% NamedBetaList$beta3)
ExplodedData_Ind$fixedeffect_beta0 <- as.matrix(ExplodedData_Ind[, names(NamedBetaList$beta0), with = F]) %*% NamedBetaList$beta0

# Constructing subsidy share:
ExplodedData_Ind[, subsidyshare := 1 - (best_guess_netprem / grossprem), ]

# Constructing a pricing plan dummy. This is the constructed plan, ignoring exchange:
ExplodedData_Ind[, plan_id := ifelse(
  constructed_plan_year == "Outside_Option",
  constructed_plan_year,
  paste0(
    substr(constructed_plan_year, 1, 2),
    substr(constructed_plan_year, 5, 16))), ]

# Lastly, merging in rating factors
ExplodedData_Ind <- merge(
  ExplodedData_Ind,
  RatingFactorDat,
  by.x = c("orig_subs_id", "year"),
  by.y = c("subscriberid", "year"),
  all.x = T)

# Collecting variables to keep in the ExplodedData_Ind:
ExplodedData_Ind_small <- ExplodedData_Ind[, .(
  orig_subs_id,
  year,
  hhid,
  plan_id,
  best_guess_AV_old,
  best_guess_sumrate,
  best_guess_netprem_old,
  grossprem_old,
  sum_concurrent_risk,
  ratingfactor = RatioSum,
  subs = ifelse(plan_id == "Outside_Option", 0, subsidyshare),
  x_av,
  alpha,
  omega,
  psi,
  fixedeffect_beta0,
  kappa =  ifelse(plan_id == "Outside_Option", 1, plan_AV /(x_av * 100))), ]

# * # * # * # * # * # * # * # * # * # * # * # * # * # * # * #
# GENERATING SMALL GROUP MARKET SAMPLE POST-MERGE a,o,p     #
# * # * # * # * # * # * # * # * # * # * # * # * # * # * # * #

#Subsetting to small group market
ExplodedData_SG <- mod_exploded_data[markettype_old == "SmallGroup"]
print(paste0("There are ", length(unique(ExplodedData_SG)), "hhid in the small group market"))

# Subsetting exploded data to the sample we want to run analysis on
ExplodedData_SG <- ExplodedData_SG[best_guess_ra %in% ravec]
ExplodedData_SG <- ExplodedData_SG[year %in% yrvec]
print(paste0("There are ", length(unique(ExplodedData_SG)), "hhid in the small group  market after restricting by ra and yr again?"))
print(ravec)

# * CLEANING PARAMETER ESTIMATES * #

if(load_params == T){
  # Setting SG market demand parameter values
  SGNamedBetaList <- list(
    beta1 = as.numeric(SGMarketEstimates[1, 1:SGMarSE[1]]),
    beta2 = as.numeric(SGMarketEstimates[1, (1 + SGMarSE[1]):SGMarSE[2]]),
    beta3 = as.numeric(SGMarketEstimates[1, (1 + SGMarSE[2]):SGMarSE[3]]),
    beta0 = as.numeric(SGMarketEstimates[1, (1 + SGMarSE[3]):SGMarSE[4]])
  )

  # Setting SG market demand parameter names
  names(SGNamedBetaList[[1]]) <- names(SGMarketEstimates)[1:SGMarSE[1]]
  names(SGNamedBetaList[[2]]) <- names(SGMarketEstimates)[(1 + SGMarSE[1]):SGMarSE[2]]
  names(SGNamedBetaList[[3]]) <- names(SGMarketEstimates)[(1 + SGMarSE[2]):SGMarSE[3]]
  names(SGNamedBetaList[[4]]) <- names(SGMarketEstimates)[(1 + SGMarSE[3]):SGMarSE[4]]
}

# * CREATING FINAL DATASET * #

# Construct alpha, omega, psi, fixed effect term in the household-options dataset:
ExplodedData_SG$alpha <- exp(as.matrix(ExplodedData_SG[, names(SGNamedBetaList$beta1), with = F]) %*% SGNamedBetaList$beta1)
ExplodedData_SG$omega <- exp(as.matrix(ExplodedData_SG[, names(SGNamedBetaList$beta2), with = F]) %*% SGNamedBetaList$beta2)
ExplodedData_SG$psi <- exp(as.matrix(ExplodedData_SG[, names(SGNamedBetaList$beta3), with = F]) %*% SGNamedBetaList$beta3)
ExplodedData_SG$fixedeffect_beta0 <- as.matrix(ExplodedData_SG[, names(SGNamedBetaList$beta0), with = F]) %*% SGNamedBetaList$beta0

# Constructing subsidy share:
ExplodedData_SG[, subsidyshare := 1 - (best_guess_netprem / grossprem), ]

# Lastly, merging in rating factors
ExplodedData_SG<- merge(
  ExplodedData_SG,
  RatingFactorDat,
  by.x = c("orig_subs_id", "year"),
  by.y = c("subscriberid", "year"),
  all.x = T)

# Constructing a pricing plan dummy. This is the constructed plan, ignoring on/off:
ExplodedData_SG[, plan_id := ifelse(
  constructed_plan_year == "Outside_Option",
  constructed_plan_year,
  paste0(substr(constructed_plan_year, 1, 2), substr(constructed_plan_year, 5, 16))), ]

# Collecting variables to keep in the ExplodedData_SG:
ExplodedData_SG[, hhid := subscriberid, ]
ExplodedData_SG_small <- ExplodedData_SG[, .(
  orig_subs_id,
  year,
  hhid = -hhid, # Small group households are assigned negtive hhids.
  plan_id,
  best_guess_AV_old,
  best_guess_sumrate,
  best_guess_netprem_old,
  grossprem_old,
  sum_concurrent_risk,
  ratingfactor = RatioSum,
  subs = ifelse(plan_id == "Outside_Option", 0, subsidyshare),
  x_av,
  alpha,
  omega,
  psi,
  fixedeffect_beta0,
  kappa = ifelse(plan_id == "Outside_Option", 1, plan_AV /(x_av * 100))), ]


# * # * # * # * # * # * # * # * #
# GENERATING PLAN DATASET       #
# * # * # * # * # * # * # * # * #

# In this section, I create a plan-level dataset which includes
# 1. beta_{4j}
# 2. beta_5'A_j

if(load_params == T){
  # Create a payer-year observation level dataset:
  SupplySideData <- data.table(
    expand.grid(payer_id = c("M0013", "M0019", "M0035", "M0062", "M0077", "M0080"),
                year = c("2015", "2016"))
  )

  # Create beta_{5j}'A_j
  names(SupplySideCoefficients) <- c("var", "mod1", "mod2", "mod3", "mod4")

  SupplySideCoefficients[, year := ifelse(grepl("2015", var), "2015", NA), ]
  SupplySideCoefficients[, year := ifelse(grepl("2016", var), "2016", year), ]

  SupplySideCoefficients[, payer_id := ifelse(grepl("M0013", var), "M0013", NA), ]
  SupplySideCoefficients[, payer_id := ifelse(grepl("M0019", var), "M0019", payer_id), ]
  SupplySideCoefficients[, payer_id := ifelse(grepl("M0035", var), "M0035", payer_id), ]
  SupplySideCoefficients[, payer_id := ifelse(grepl("M0062", var), "M0062", payer_id), ]
  SupplySideCoefficients[, payer_id := ifelse(grepl("M0077", var), "M0077", payer_id), ]
  SupplySideCoefficients[, payer_id := ifelse(grepl("M0080", var), "M0080", payer_id), ]

  constant <- as.numeric(SupplySideCoefficients[var == "Constant"]$mod3)
  SupplySideCoefficients2015 <- SupplySideCoefficients[year == "2015" | is.na(year)]
  SupplySideCoefficients2015 <- SupplySideCoefficients2015[ , .(
    fe_beta_5 = sum(as.numeric(mod3)) + constant,
    year = "2015"
  ), by = c("payer_id")]

  SupplySideCoefficients2016 <- SupplySideCoefficients[year == "2016" | is.na(year)]
  SupplySideCoefficients2016 <- SupplySideCoefficients2016[ , .(
    fe_beta_5 = sum(as.numeric(mod3)) + constant,
    year = "2016"
  ), by = c("payer_id")]

  SupplySideFEs <- rbind(SupplySideCoefficients2015, SupplySideCoefficients2016)

  SupplySideData <- merge(
    SupplySideData,
    SupplySideFEs,
    by = c("payer_id", "year"),
    all.x = T
  )

  SupplySideData$beta_4 <- as.numeric(SupplySideCoefficients[var == "Medical costs"]$mod3)
}

PlanData <- unique(rbind(ExplodedData_Ind[, .(plan_id), ], ExplodedData_SG[, .(plan_id), ]))
PlanData[, payer_id := paste0("M00", substr(plan_id, 4, 5)), ]
PlanData[, year := substr(plan_id, 11, 14), ]

# Removing payer 99 plans and one other plan that was not included in regression:
PlanData <- PlanData[!(payer_id == "M0099" | (payer_id == "M0037" & year == 2016))]
ExplodedData_Ind_small <- ExplodedData_Ind_small[!(paste0("M00", substr(plan_id, 4, 5)) == "M0099" | (paste0("M00", substr(plan_id, 4, 5)) == "M0037" & year == 2016))]
ExplodedData_SG_small <- ExplodedData_SG_small[!(paste0("M00", substr(plan_id, 4, 5)) == "M0099" | (paste0("M00", substr(plan_id, 4, 5)) == "M0037" & year == 2016))]

# Merging with supply side data
PlanData <- merge(
  PlanData,
  SupplySideData,
  by = c("payer_id", "year"),
  all.x = T)

# Setting betas to 0 if outside option
if(sum(is.na(PlanData$beta_4)) != 1){ stop("More than just outside option is missing beta4")}
PlanData[, beta_4 := ifelse(is.na(beta_4), 0, beta_4), ]
PlanData[, fe_beta_5 := ifelse(is.na(fe_beta_5), 0, fe_beta_5), ]

# Kondo!
PlanData <- PlanData[, .(
  plan_id,
  beta_4,
  fe_beta_5
)]
